/* Copyright (C) 2000-2002 Lavtech.com corp. All rights reserved.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
*/

#include <udm_config.h>

#include <stdlib.h>
#include <string.h>
#include <errno.h>

#include <sys/types.h>
#ifdef   HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_WINSOCK_H
#include <winsock.h>
#endif
#ifdef HAVE_SYS_SOCKET_H
#include <sys/socket.h>
#endif
#ifdef HAVE_NETINET_IN_H
#include <netinet/in.h>
#endif
#ifdef HAVE_ARPA_INET_H
#include <arpa/inet.h>
#endif
#ifdef HAVE_ARPA_NAMESER_H
#include <arpa/nameser.h>
#endif
#ifdef HAVE_RESOLV_H
#include <resolv.h>
#endif
#ifdef HAVE_NETDB_H
#include <netdb.h>
#endif

#include "udm_store.h"
#include "udm_services.h"
#include "udm_xmalloc.h"
#include "udm_crc32.h"
#include "udm_utils.h"
#include "udm_log.h"
#include "udm_vars.h"
#include "udm_parsehtml.h"
#include "udm_unicode.h"
#include "udm_unidata.h"
#include "udm_searchtool.h"

#ifdef HAVE_ZLIB


#ifndef HAVE_HSTRERROR
/*
 * Fallback for platforms without hstrerror(): maps a resolver error code
 * to a short message.  Codes outside the table (including negative ones)
 * yield "unknown error" instead of indexing outside the array.
 * Note: the macro evaluates its argument more than once.
 */
static const char *h_errlist[] = {
    "Error 0",
    "Unknown host",
    "Host name lookup failure",
    "Unknown server error",
    "No address associated with name"
};
# define hstrerror(err)  (((err) >= 0 && (err) <= 4) ? h_errlist[(err)] : "unknown error")
#endif


#endif


/*
 * Send the content of Doc to the stored daemon.
 *
 * The request is: the command byte 'S', the record id (CRC32 of the
 * document's "URL" section), the content size and then the content bytes
 * starting at Doc->Buf.content.  The daemon is expected to answer with the
 * six bytes "STORED".
 *
 * Returns 0 when the daemon acknowledged the document, -1 on any failure
 * (no connection, send/receive error, unexpected reply) or when built
 * without HAVE_ZLIB.
 */
__INDLIB__ int UdmStoreDoc(UDM_AGENT *Agent, UDM_DOCUMENT *Doc) {

#ifdef HAVE_ZLIB
  const char *hello = "S\0";
  /* Zero-filled so that a short reply can never compare equal to "STORED". */
  char result[8] = {0};
  int s;
  int rc = -1;
  size_t content_size=Doc->Buf.size - (Doc->Buf.content-Doc->Buf.buf);
  const char *url = UdmVarListFindStr(&Doc->Sections, "URL", "");
  int rec_id=UdmCRC32(url, strlen(url));
  
  if ((s = UdmStoreOpen(Agent)) < 0)return -1;
  
  /* NOTE(review): assumes UdmSend() reports failure with a negative return,
     the same convention this file already relies on for UdmRecvall(). */
  if ((UdmSend(s, hello, 1, 0) >= 0) &&
      (UdmSend(s, &rec_id, sizeof(rec_id), 0) >= 0) &&
      (UdmSend(s, &content_size, sizeof(content_size), 0) >= 0) &&
      (UdmSend(s, Doc->Buf.content, content_size, 0) >= 0) &&
      (UdmRecvall(s, result, 6) >= 0) &&
      (memcmp(result, "STORED", 6) == 0)) {
    rc = 0;
  }
  closesocket(s);
  
  return rc;
  
#else
  return -1;
#endif

}


/*
 * Retrieve the cached copy of a document from the stored daemon.
 *
 * The record id is parsed from the "STORED_ID" section of Doc.  The request
 * is the command byte 'G' followed by the record id; the daemon answers
 * with the content size and then the content itself.
 *
 * Buffer ownership: if Doc->Buf.buf is NULL a buffer of content_size + 1
 * bytes is malloc'ed here and left in Doc->Buf.buf.  If the caller supplies
 * a buffer it is used as is -- this function cannot check its capacity, so
 * the caller must make it large enough for the stored copy plus a
 * terminating NUL.
 *
 * On success Doc->Buf.buf is NUL-terminated, Doc->Buf.size holds the number
 * of bytes received and Doc->Buf.content points at the start of the buffer.
 *
 * Returns 0 on success, -1 on failure or when built without HAVE_ZLIB.
 */
__INDLIB__ int UdmUnStoreDoc(UDM_AGENT *Agent, UDM_DOCUMENT *Doc) {
  
#ifdef HAVE_ZLIB
  const char *hello = "G\0";
  int s;
  unsigned int rec_id = 0;
  size_t content_size=0;
  ssize_t nread;
  
  /*!!! Do not replace this by UdmVarListFindInt
     (presumably because the id is unsigned and may not fit a signed int). */
  if (sscanf(UdmVarListFindStr(&Doc->Sections, "STORED_ID", "0"), "%u", &rec_id) != 1) {
    rec_id = 0; /* unparsable id: fall back to the same value as the "0" default */
  }
  Doc->Buf.size=0;
  
  if ((s = UdmStoreOpen(Agent)) < 0) return -1;
  
  /* NOTE(review): assumes UdmSend() reports failure with a negative return,
     the same convention this file already relies on for UdmRecvall(). */
  if (
       (UdmSend(s, hello, 1, 0) < 0) ||
       (UdmSend(s, &rec_id, sizeof(rec_id), 0) < 0) ||
       (UdmRecvall(s, &content_size, sizeof(content_size)) < 0) ||
       (content_size == 0) ||
       (content_size == (size_t)-1)   /* content_size + 1 below would wrap to 0 */
     ) 
  {
    closesocket(s);
    return -1;
  }
 
  if (Doc->Buf.buf == NULL) {
    Doc->Buf.buf = (char*) malloc(content_size + 1);
  }
  if (
      (Doc->Buf.buf == NULL) ||
       ((nread = UdmRecvall(s, Doc->Buf.buf, content_size)) < 0)
      ) {
    closesocket(s);
    return -1;
  }
  
  closesocket(s);
  Doc->Buf.buf[nread] = '\0';
  Doc->Buf.size = nread;
  Doc->Buf.content=Doc->Buf.buf;
  return 0;
  
#else
  return -1;
#endif
}

/*
 * Open a TCP connection to the stored daemon.
 *
 * The address is taken from the "StoredAddr" configuration variable in the
 * form "host" or "host:port"; without an explicit port UDM_STORED_PORT is
 * used.  Only IPv4 (AF_INET) addresses are supported because the address is
 * copied into a struct sockaddr_in.
 *
 * Returns the connected socket descriptor, or -1 on any failure (variable
 * not set, out of memory, name resolution, socket() or connect() error) or
 * when built without HAVE_ZLIB.  The caller closes the socket.
 */
int UdmStoreOpen(UDM_AGENT *Agent) {

#ifdef HAVE_ZLIB
  struct hostent *hp;
  struct sockaddr_in server_addr;
  int s, nport = UDM_STORED_PORT;
  int err;
  char * cport;
  char * stored_addr;
  const char *sa = UdmVarListFindStr(&Agent->Conf->Vars, "StoredAddr", NULL);

  if(!sa) return -1;

  /* Work on a private copy: the host part is cut off at ':' below. */
  if ((stored_addr = strdup(sa)) == NULL) {
    err = errno;
    UdmLog(Agent, UDM_LOG_ERROR, "StoreOpen ERR strdup: %s", strerror(err));
    return -1;
  }

  if((cport=strchr(stored_addr,':'))){
    *cport='\0';
    nport=atoi(cport+1);
  }

  hp = gethostbyname(stored_addr);
  free(stored_addr);
  if (hp == NULL) {
    UdmLog(Agent, UDM_LOG_ERROR, "StoreOpen ERR gethostbyname: %s", hstrerror(h_errno));
    return -1;
  }

  /* sin_addr only has room for an IPv4 address; never copy more than fits. */
  if ((hp->h_addrtype != AF_INET) ||
      (hp->h_length <= 0) ||
      ((size_t)hp->h_length > sizeof(server_addr.sin_addr))) {
    UdmLog(Agent, UDM_LOG_ERROR, "StoreOpen ERR gethostbyname: unsupported address family");
    return -1;
  }

  memset(&server_addr, 0, sizeof(server_addr));
  memmove(&server_addr.sin_addr, hp->h_addr, (size_t)hp->h_length);
  server_addr.sin_family = hp->h_addrtype;
  server_addr.sin_port = htons((u_short)nport);

  if((s = socket(AF_INET, SOCK_STREAM, 0)) == -1) {
    err = errno; /* save before any other call can overwrite it */
    UdmLog(Agent, UDM_LOG_ERROR, "StoreOpen ERR socket: %s", strerror(err));
    return -1;
  }
  
  if(connect(s, (struct sockaddr *)&server_addr, sizeof(server_addr)) == -1) {
    err = errno;
    UdmLog(Agent, UDM_LOG_ERROR, "StoreOpen ERR connect to %s: %s", inet_ntoa(server_addr.sin_addr), strerror(err));
    closesocket(s);
    return -1;
  }

  return s;

#else
  return -1;
#endif

}


/*
 * Ask the stored daemon to delete the cached copy of Doc.
 *
 * The request is the command byte 'D' followed by the record id (CRC32 of
 * the document's "URL" section).  No reply is read from the daemon.
 *
 * Returns 0 when the request was sent, -1 when the connection could not be
 * opened, when sending failed, or when built without HAVE_ZLIB.
 */
__INDLIB__ int UdmStoreDeleteDoc(UDM_AGENT *Agent, UDM_DOCUMENT *Doc) {

#ifdef HAVE_ZLIB
  const char *hello = "D\0";
  int s;
  int rc = 0;
  const char *url = UdmVarListFindStr(&Doc->Sections, "URL", "");
  int rec_id=UdmCRC32(url, strlen(url));

  if ((s = UdmStoreOpen(Agent)) < 0) return -1;
  
  /* NOTE(review): assumes UdmSend() reports failure with a negative return,
     the same convention this file already relies on for UdmRecvall(). */
  if ((UdmSend(s, hello, 1, 0) < 0) ||
      (UdmSend(s, &rec_id, sizeof(rec_id), 0) < 0)) {
    rc = -1;
  }
  closesocket(s);
  return rc;

#else
  return -1;
#endif

}

/* ** ** ** */
/*
 * Grow the tokenizer buffer by one chunk and fetch the next chunk of the
 * document from the stored daemon over t->socket.
 *
 * Protocol as used here: the 1-based number of the wanted chunk is sent,
 * the daemon answers with the chunk size followed by that many bytes.
 * A chunk shorter than UDM_DOCHUNKSIZE is the last one; in that case a
 * zero chunk number is sent as "stop" and t->finished is set.
 *
 * If the buffer moves during realloc(), every pointer into it that the
 * tokenizer keeps (e, b, s, *lt and the name/val of each token) is rebased.
 *
 * On any failure (out of memory, socket error, chunk size larger than
 * UDM_DOCHUNKSIZE) the content received so far is kept NUL-terminated and
 * t->finished is set so no further chunks are requested.
 */
static void UdmStoredFetchChunk(UDM_HTMLTOK *t) {
  char *OldContent = t->Content;
  char *NewContent;
  char *dst;
  size_t ChunkSize = 0, i;
  ssize_t got;
  int z = 0;

  NewContent = (char*)realloc(t->Content, (size_t)(t->chunks + 1) * UDM_DOCHUNKSIZE + 1);
  if (NewContent == NULL) {
    /* The old buffer is still valid and untouched: just stop fetching. */
    UdmSend(t->socket, &z, sizeof(z), 0);
    t->finished = 1;
    return;
  }
  t->Content = NewContent;

  /* Rebase first, so every early return below leaves consistent pointers. */
  if (t->Content != OldContent) {
    t->e = t->Content + (t->e - OldContent);
    t->b = t->Content + (t->b - OldContent);
    t->s = t->Content + (t->s - OldContent);
    *(t->lt) = t->Content + (*(t->lt) - OldContent);
    for (i = 0; i < t->ntoks; i++) {
      t->toks[i].name = (t->toks[i].name) ? t->Content + (t->toks[i].name - OldContent) : NULL;
      t->toks[i].val = (t->toks[i].val) ? t->Content + (t->toks[i].val - OldContent) : NULL;
    }
  }

  t->chunks++;
  dst = &t->Content[(t->chunks-1) * UDM_DOCHUNKSIZE];

  /* The buffer has room for exactly one more chunk plus the terminator,
     so a larger size announced by the peer must be rejected. */
  if ((UdmSend(t->socket, &t->chunks, sizeof(t->chunks), 0) < 0) ||
      (UdmRecvall(t->socket, &ChunkSize, sizeof(ChunkSize)) < 0) ||
      (ChunkSize > (size_t)UDM_DOCHUNKSIZE)) {
    *dst = '\0';
    t->finished = 1;
    return;
  }

  /* Keep only what actually arrived; a failed or short read ends the document. */
  got = UdmRecvall(t->socket, dst, ChunkSize);
  if (got < 0) got = 0;
  if ((size_t)got < ChunkSize) ChunkSize = (size_t)got;

  if (ChunkSize != UDM_DOCHUNKSIZE) {
    UdmSend(t->socket, &z, sizeof(z), 0);
    t->finished = 1;
  }
  dst[ChunkSize] = '\0';
}

/*
 * Tokenizer callback: advance the "b" cursor by one character, fetching
 * the next chunk first when the cursor gets within 32 bytes of the end of
 * the data downloaded so far.  d is the UDM_HTMLTOK being parsed.
 */
static void UdmNextCharB_stored(void *d) {
  UDM_HTMLTOK *t = (UDM_HTMLTOK *)d;
  if (!t->finished && ((t->b - t->Content) > t->chunks * UDM_DOCHUNKSIZE - 32)) {
    UdmStoredFetchChunk(t);
  }
  (t->b)++;
}

/*
 * Tokenizer callback: advance the "e" cursor by one character, fetching
 * the next chunk first when the cursor gets within 32 bytes of the end of
 * the data downloaded so far.  d is the UDM_HTMLTOK being parsed.
 */
static void UdmNextCharE_stored(void *d) {
  UDM_HTMLTOK *t = (UDM_HTMLTOK *)d;
  if (!t->finished && ((t->e - t->Content) > t->chunks * UDM_DOCHUNKSIZE - 32)) {
    UdmStoredFetchChunk(t);
  }
  (t->e)++;
}

/*
 * Scan the zero-terminated int string s for the first position at which
 * any word of wwl (other than those whose origin is UDM_WORD_ORIGIN_STOP)
 * begins.
 *
 * c[i] must hold the first element of word i and len[i] the number of
 * elements of word i that follow the first one; the tail is compared
 * against wwl->Word[i].uword[1..].
 *
 * Returns a pointer to the first element of the match inside s, or NULL
 * when no word occurs.
 */
static int * UdmUniStrWWL(int *s, UDM_WIDEWORDLIST *wwl, int *c, size_t *len) {
  int *pos;
  size_t w;

  for (pos = s; *pos != 0; pos++) {
    for (w = 0; w < wwl->nwords; w++) {
      /* Cheap first-element test before the full comparison. */
      if (*pos != c[w]) continue;
      if (wwl->Word[w].origin == UDM_WORD_ORIGIN_STOP) continue;
      if (memcmp(pos + 1, &(wwl->Word[w].uword[1]), len[w] * sizeof(int)) == 0) {
        return pos;
      }
    }
  }
  return NULL;
}

/*
 * Returns 0 when c is one of the code points treated as a word separator
 * here (U+0020, U+0026 '&', U+00A0, U+1680, U+2000..U+200B, U+202F,
 * U+3000) and 1 for every other value.
 */
static int UdmUniNSpace(int c) {
	switch (c) {
	case 0x0020:
	case 0x0026:
	case 0x00A0:
	case 0x1680:
	case 0x202F:
	case 0x3000:
		return 0;
	default:
		break;
	}
	return !((c >= 0x2000) && (c <= 0x200B));
}


/*
 * Append at most n bytes of src to the text buffer that starts at HDoc and
 * currently ends at *HEnd.  cap is the number of text bytes the buffer can
 * hold (one further byte is reserved for the terminator); whatever does not
 * fit is dropped.  *HEnd is advanced and the buffer stays NUL-terminated.
 */
static void UdmExcerptAppend(char *HDoc, char **HEnd, size_t cap, const char *src, size_t n) {
  size_t used = (size_t)(*HEnd - HDoc);
  size_t room = (used < cap) ? cap - used : 0;

  if (n > room) n = room;
  memcpy(*HEnd, src, n);
  *HEnd += n;
  **HEnd = '\0';
}

/*
 * Build a short excerpt of the cached copy of Doc around the query words
 * of Res.
 *
 * The cached copy is fetched chunk by chunk from the stored daemon (command
 * byte 'E', record id = CRC32 of the "URL" section), run through the HTML
 * tokenizer, and only text tokens are kept.  The text is converted from the
 * document charset ("Charset" section, default iso-8859-1) to the internal
 * int representation, lower-cased and searched for the words in
 * Res->WWList.  Around each hit that starts at the beginning of the text or
 * right after a separator, up to 64 elements on either side are copied,
 * trimmed to separator boundaries, and joined with "...".  Collection stops
 * once 256 or more elements have been gathered or the text is exhausted.
 * The result is converted to the local charset with HTML recoding, and
 * newlines, carriage returns and tabs are replaced by spaces.
 *
 * Parameters:
 *   query - agent; supplies the local charset and the stored address.
 *   Res   - search result whose WWList holds the words to look for.
 *   Doc   - document; the "URL", "Charset" and "Content-Length" sections
 *           are read.
 *   size  - not used by this implementation.
 *
 * Returns a malloc'ed, NUL-terminated string the caller must free, or NULL
 * on any failure (unknown charset, out of memory, no connection, protocol
 * error, no text in the document).
 */
__INDLIB__ char * UdmExcerptDoc(UDM_AGENT *query, UDM_RESULT *Res, UDM_DOCUMENT *Doc, size_t size) {
  char *HDoc = NULL, *HEnd = NULL;
  const char *htok, *last;
  const char *lcharset;
  UDM_CHARSET *lcs = NULL, *dcs = NULL, *sys_int;
  UDM_HTMLTOK tag;
  int *start, *end, *text_end, *uni = NULL, ures, *p, *oi = NULL, dot[] = {0x2e, 0x2e, 0x2e, 0}, *np, add;
  char *os = NULL;
  char *cc;
  int *c = NULL, s, clen;
  size_t *wlen = NULL, i, len, maxwlen = 0, ulen, prevlen;
  UDM_CONV dc_uni, uni_lc;
  const char *hello = "E\0";
  const char *url = UdmVarListFindStr(&Doc->Sections, "URL", "");
  int rec_id = UdmCRC32(url, strlen(url));
  size_t ChunkSize = 0, DocSize;
  ssize_t got;

  (void)size;

  if (query->Conf->lcs == NULL) {
    lcharset = UdmVarListFindStr(&query->Conf->Vars, "CS", "");
    if (lcharset == NULL || (!strcmp(lcharset, ""))) {
      lcharset = UdmVarListFindStr(&query->Conf->Vars, "LocalCharset", "iso-8859-1");
    }
    lcs = UdmGetCharSet(lcharset);
  } else {
    lcs = query->Conf->lcs;
  }
  dcs = UdmGetCharSet(UdmVarListFindStr(&Doc->Sections,"Charset","iso-8859-1"));
  
  if (!lcs || !dcs) return NULL;
  if (!(sys_int=UdmGetCharSet("sys-int")))
    return NULL;
  
  UdmConvInit(&dc_uni,dcs,sys_int,0);
  UdmConvInit(&uni_lc,sys_int,lcs,UDM_RECODE_HTML);

  /* c[i]: first element of word i; wlen[i]: number of elements after it;
     oi: the excerpt being collected, in the internal representation. */
  c = (int *) malloc(Res->WWList.nwords * sizeof(int));
  wlen = (size_t *) malloc(Res->WWList.nwords * sizeof(size_t));
  oi = (int *) malloc(512 * sizeof(int));
  if ((c == NULL) || (wlen == NULL) || (oi == NULL)) goto out_free;

  for (i = 0; i < Res->WWList.nwords; i++) {
    wlen[i] = Res->WWList.Word[i].len - 1;
    c[i] = Res->WWList.Word[i].uword[0];
    if (wlen[i] > maxwlen) maxwlen = wlen[i];
  }
  oi[0]=0;

  /* A missing, zero or negative Content-Length must not size the buffers. */
  clen = UdmVarListFindInt(&Doc->Sections, "Content-Length", UDM_MAXDOCSIZE);
  DocSize = (clen > 0) ? (size_t)clen : (size_t)UDM_MAXDOCSIZE;

  /* HDoc holds at most DocSize text bytes plus the terminator.  uni is
     zero-filled so the converted text is always terminated, whether or not
     UdmConv() writes a terminator itself. */
  HDoc = (char *)malloc(DocSize + 1);
  uni = (int *)calloc(DocSize + 10, sizeof(int));
  if ((HDoc == NULL) || (uni == NULL)) goto out_free;
  HEnd = HDoc;
  HDoc[0]='\0';

  UdmHTMLTOKInit(&tag); 
  tag.next_b = &UdmNextCharB_stored;
  tag.next_e = &UdmNextCharE_stored;
  tag.chunks = 1;
  tag.Content = NULL;   /* nothing to free until the first chunk is allocated */
  
  if ((s = tag.socket = UdmStoreOpen(query)) < 0) goto out_free;

  UdmSend(s, hello, 1, 0);
  UdmSend(s, &rec_id, sizeof(rec_id), 0);
  UdmSend(s, &tag.chunks, sizeof(tag.chunks), 0);
  /* ChunkSize starts at 0, so a failed exchange is handled like "no copy". */
  if (UdmRecvall(s, &ChunkSize, sizeof(ChunkSize)) < 0) goto out_close;

  if (ChunkSize == 0) {
    tag.chunks = 0;
    UdmSend(s, &tag.chunks, sizeof(tag.chunks), 0);
    goto out_close;
  }
  /* The chunk callbacks lay chunks out UDM_DOCHUNKSIZE apart, so a larger
     first chunk cannot be valid. */
  if (ChunkSize > (size_t)UDM_DOCHUNKSIZE) goto out_close;

  tag.Content = (char*)malloc(ChunkSize+10);
  if (tag.Content == NULL) goto out_stop;
  got = UdmRecvall(s, tag.Content, ChunkSize);
  if (got < 0) goto out_close;
  if ((size_t)got < ChunkSize) ChunkSize = (size_t)got;
  tag.Content[ChunkSize] = '\0';

  /* Collect text until the first non-empty text token has been copied. */
  htok = UdmHTMLToken(tag.Content, &last, &tag);
  for (len = 0; (len == 0) && htok; len = HEnd - HDoc) {
    if (tag.type == UDM_HTML_TXT) {
      UdmExcerptAppend(HDoc, &HEnd, DocSize, htok, (size_t)(last-htok));
      len = HEnd - HDoc;
    }
    htok = UdmHTMLToken(NULL, &last, &tag);
  }

  if (HEnd == HDoc) goto out_stop;   /* the document contains no text */

  prevlen = 0;
  ulen = 0;

  add = UdmConv(&dc_uni, (char*)uni, sizeof(*uni)*DocSize, HDoc, len) / sizeof(*uni);
  UdmUniStrToLower(uni);
  prevlen = len;
  ulen += add;

  
  for (p = uni; UdmUniLen(oi) < 256; ) {

    /* No hit in what has been converted so far: pull in more text. */
    while((np  = UdmUniStrWWL(p, &(Res->WWList), c, wlen)) == NULL) {

      while(htok && (len == prevlen)) {
        if (tag.type == UDM_HTML_TXT) {
          UdmExcerptAppend(HDoc, &HEnd, DocSize, htok, (size_t)(last-htok));
          len = HEnd - HDoc;
        }
        htok = UdmHTMLToken(NULL, &last, &tag);
      }

      if (len == prevlen) break;   /* no more text available */

      add = UdmConv(&dc_uni, (char*)(uni + ulen), sizeof(*uni)*(DocSize - ulen), HDoc + prevlen, len - prevlen) / sizeof(*uni);
      UdmUniStrToLower(uni + ulen);
      prevlen = len;
      /* Step back by the longest word so a word that straddles the old end
         of the text is still found. */
      p = (ulen > maxwlen) ? uni + ulen - maxwlen : uni;
      ulen += add;

    }
    if (np == NULL) break;
    p = np;
    /* Accept the hit only at the start of the text or right after a separator. */
    if ( ( (p > uni) && (!UdmUniNSpace(*(p-1))) ) || (p == uni)  ) {
      /* The converted text ends at uni + ulen (element count), not at
         uni + len (byte count of the source text). */
      text_end = uni + ulen;
      start = ((p - uni) > 64) ? p - 64 : uni;
      end = ((text_end - p) > 64) ? p + 64 : text_end;
      while(UdmUniNSpace(*start) && (start < end)) start++;
      while(UdmUniNSpace(*end) && (start < end)) end--;
      if (start != uni) UdmUniStrCat(oi, dot);
      /* Temporarily terminate the text at end to append [start, end). */
      ures = *end; *end = 0; UdmUniStrCat(oi, start); *end = ures;
      if (end != text_end) UdmUniStrCat(oi, dot);
      p = end;
    } else p++;
  }


  /* Zero-filled so the result is terminated right after the converted bytes. */
  if ((os = (char *)calloc(5120, sizeof(char))) == NULL) goto out_stop;
  
  UdmConv(&uni_lc,os,sizeof(*os)*5119,(char*)oi,sizeof(*oi)*UdmUniLen(oi));
  os[5119]='\0';
  
  for (cc = os; *cc != '\0'; cc++) {
    if ((*cc == '\n') || (*cc == '\r') || (*cc == '\t')) {
      *cc = ' ';
    }
  }

out_stop:
  /* Tell the daemon no further chunks are wanted (chunk number 0). */
  if (!tag.finished) {
    tag.chunks = 0;
    UdmSend(s, &tag.chunks, sizeof(tag.chunks), 0);
  }
out_close:
  closesocket(s);
  free(tag.Content);
out_free:
  UDM_FREE(c); UDM_FREE(wlen); UDM_FREE(oi); UDM_FREE(HDoc); UDM_FREE(uni);
  return os;
}

